Aesthetics are visual properties that are used to display your variables. Aesthetics can include position on an x or y axis, line type, colours, shapes etc. The process of assigning variables from a dataset to an aesthetic is known as mapping. + position: x, y + size + colour/fill + shape + width + line type + alpha/transparency
Refers to geometric objects as ‘geoms’ for short. ex. Points and Lines are the geoms used to represent date and ozone levels, which were mapped to x and y aesthetics
Stats is short for ‘statistical transformations’.Examples of other common statistical transformations include quartiles used in boxplots, density estimates for probability distributions, a line of best fit from a linear regression, statistical summaries (means, medians, error bars etc.), and counts (frequencies, proportions and percentages).
Scales are used to control the mapping between a variable and an aesthetic.
Position adjustments aim to avoid overlapping elements by either dodging, filling, jittering, nudging or stacking. Layers can incorporate multiple position adjustments as follows.
The coordinate system is used to define the plane to which the data or stats are mapped.
Faceting is the process of breaking a visualisation into subsets and displaying the subsets as small multiples.
Cars <- read.csv('Cars.csv')
Cars$Sports <- Cars$Sports %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$Sport_utility <- Cars$Sport_utility %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$Wagon <- Cars$Wagon %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$Minivan <- Cars$Minivan %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$Pickup <- Cars$Pickup %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$All_wheel_drive <- Cars$All_wheel_drive %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$Rear_wheel_drive <- Cars$Rear_wheel_drive %>% factor(levels=c(0,1),labels=c('No','Yes'), ordered=TRUE)
Cars$Cylinders <- Cars$Cylinders %>% as.factor()
Cars_filter <- Cars %>% filter(Cylinders %in% c("4","6","8"))
ggplot() +
coord_cartesian() +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Ozone (Mean ppb 13:00 - 15:00)")
# Create a data column named "date" based on columns "Month" and "Day"
View(airquality)
airquality$date <- as.Date(with(airquality, paste( Month, Day,sep="-")), "%m-%d")
#airquality
step1 <- ggplot() +# Add a points layer to the ggplot object
coord_cartesian() +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Ozone (Mean ppb 13:00 - 15:00)") +
layer(
data=airquality,
mapping=aes(x=date, y=Ozone),
stat="identity",
geom="point",
position = position_identity()
)
step2 <- step1+layer(
data = airquality,
mapping = aes(x = date, y = Ozone),
stat ="identity",
geom ="line",
position = position_identity()
)
step3 <- step2+layer(
data = airquality,
mapping = aes(x = date, y = Ozone),
stat ="smooth",
params =list(method ="loess", span = 0.4, se = FALSE),
geom ="smooth",
position = position_identity()
)
p <- ggplot(data = airquality, aes(x = date, y = Ozone))
p + geom_point() +
geom_line(aes(group = 1)) +
geom_smooth(se = FALSE, span = 0.4) +
labs(
title = "Air Quality - New York 1973 (Roosevelt Island)",
x = "Date",
y = "Ozone (Mean ppb 13:00 - 15:00)"
)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 37 rows containing non-finite values (stat_smooth).
## Warning: Removed 37 rows containing missing values (geom_point).
qplot(x = Cylinders,data = Cars, geom = "bar")
### BOX PLOT —-
qplot(x = Cylinders, y = Kilowatts, data = Cars,geom = "boxplot")
qplot(x = Weight,y = Economy_city, data = Cars,geom = "point")
## Warning: Removed 16 rows containing missing values (geom_point).
### Transform variables —-
qplot(x = Weight,y = Economy_city, data = Cars,geom = "point", log = "xy")
## Warning: Removed 16 rows containing missing values (geom_point).
# Adjusted scaling ----
qplot(x = log(Weight),y = log(Economy_city), data = Cars,geom = "point")
## Warning: Removed 16 rows containing missing values (geom_point).
qplot(x = log(Weight),y = log(Economy_city), data = Cars,geom = "point",colour = Cylinders)
## Warning: Removed 16 rows containing missing values (geom_point).
# Trend lines ----
qplot(x = log(Weight),y = log(Economy_city), data = Cars,geom = "point") +
stat_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).
qplot(x = log(Weight),y = log(Economy_city), data = Cars,geom = "point") +
stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).
### Facet: + compare the relationship between a car’s power (measured using kilowatts) and its retail price —-
qplot(x = Kilowatts,y = Retail_price, data = Cars_filter,
geom = "point",colour = Cylinders) +
stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
qplot(x = Kilowatts,y = Retail_price, data = Cars_filter,
geom = "point", facets = Cylinders ~.) +
stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Bar chart with a single dimension: ----
qplot(x = Cylinders,data = Cars, geom = "bar")
# Histogram with a single dimension: ----
hist(Cars$Weight)
qplot(x = Weight,data = Cars,geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Change bins: ----
qplot(x = Weight,data = Cars,geom = "histogram",bins = 40)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# 3.2.1 ggplot: A layered approach
Cars <- read.csv('Cars.csv')
Cars$Cylinders <- Cars$Cylinders %>% as.factor()
Cars_filter <- Cars %>% filter(Cylinders %in% c("4","6","8"))
# Box plot ----
p <- ggplot(data = Cars_filter, aes(x = Cylinders, y = Economy_city))
p + geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Add outlier.shape = NA to prevent the box plot from plotting outliers, ----
# which are already plotted by geom_point.
p <- ggplot(data = Cars_filter,aes(x = Cylinders, y = Economy_city))
p + geom_boxplot(outlier.shape = NA) +
geom_jitter()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing missing values (geom_point).
# Add Transparency ----
p <- ggplot(data = Cars_filter,aes(x = Cylinders, y = Economy_city))
p + geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = 1/5)
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing missing values (geom_point).
# Labels ----
p <- ggplot(data = Cars_filter,aes(x = Cylinders, y = Economy_city))
p + geom_boxplot(outlier.shape = NA) + geom_jitter(alpha = 1/5) +
ylab("City Fuel Economy (km/L)") +
ggtitle("Smaller engines have better city fuel economy")
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing missing values (geom_point).
# Add means ----
p <- ggplot(data = Cars_filter,aes(x = Cylinders, y = Economy_city))
p + geom_boxplot(outlier.shape = NA) + geom_jitter(alpha = 1/5) +
ylab("City Fuel Economy (km/L)") +
ggtitle("Smaller engines have better city fuel economy") +
stat_summary(fun.y=mean, colour="red", geom="point",shape = 17)
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing non-finite values (stat_summary).
## Warning: Removed 11 rows containing missing values (geom_point).
# Themes ----
p <- ggplot(data = Cars_filter,aes(x = Cylinders, y = Economy_city))
p + geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = 1/5) +
ylab("City Fuel Economy (km/L)") +
ggtitle("Smaller engines have better city fuel economy") +
stat_summary(fun.y=mean, colour="red", geom="point",shape = 17) +
theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing non-finite values (stat_summary).
## Warning: Removed 11 rows containing missing values (geom_point).
# theme_bw()
# Scatter plots ----
p <- ggplot(data = Cars_filter,
aes(x = log(Kilowatts),
y = log(Economy_city),
colour = Cylinders))
p + geom_point()
## Warning: Removed 11 rows containing missing values (geom_point).
# Add the trend lines ----
p <- ggplot(data = Cars_filter,
aes(x = log(Kilowatts),
y = log(Economy_city),
colour = Cylinders))
p + geom_point() +
facet_grid(~ Cylinders) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
# Exercise ------------
str(Cars_filter)
## 'data.frame': 413 obs. of 19 variables:
## $ Vehicle_name : chr "Chevrolet Aveo 4dr" "Chevrolet Aveo LS 4dr hatch" "Chevrolet Cavalier 2dr" "Chevrolet Cavalier 4dr" ...
## $ Sports : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sport_utility : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Wagon : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Minivan : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pickup : int 0 0 0 0 0 0 0 0 0 0 ...
## $ All_wheel_drive : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Rear_wheel_drive: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Retail_price : int 23380 25170 29220 29620 32770 27340 30080 26540 27460 30920 ...
## $ Dealer_cost : int 21930 23604 27394 27768 30714 25698 28172 24964 25812 28992 ...
## $ Engine_size : num 1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
## $ Cylinders : Factor w/ 8 levels "-1","3","4","5",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Kilowatts : int 77 77 104 104 104 98 98 97 82 97 ...
## $ Economy_city : int 45 45 42 42 42 47 47 42 43 42 ...
## $ Economy_highway : int 55 55 60 60 60 58 58 53 58 53 ...
## $ Weight : int 1075 1065 1187 1214 1187 1171 1191 1185 1182 1182 ...
## $ Wheel_base : int 249 249 264 264 264 267 267 262 262 262 ...
## $ Length : int 424 389 465 465 465 442 442 427 427 427 ...
## $ Width : int 168 168 175 173 175 170 170 170 170 170 ...
Weight_T <- c(Cars_filter$Weight/907.1847)
p <- ggplot(data = Cars_filter,
aes(x = Cylinders,
y = Engine_size,
colour = Cylinders))
p + geom_boxplot()+
xlab("Lenght of the Car") +
ggtitle("Lenght of the car to Weight (in Tons) plot")+
facet_grid(~ Cylinders)
# Exercise ------------
Make_D <- separate(Cars_filter,Vehicle_name,into="Make",sep = " ")
## Warning: Expected 1 pieces. Additional pieces discarded in 413 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
Avg_Price <- Make_D %>% group_by(Make)%>% summarise("Avg_price"= mean(Retail_price))
## `summarise()` ungrouping output (override with `.groups` argument)
Avg_Price
## # A tibble: 42 x 2
## Make Avg_price
## <chr> <dbl>
## 1 Acura 85877.
## 2 Audi 86616.
## 3 BMW 86570.
## 4 Buick 61076.
## 5 Cadillac 100949.
## 6 Chevrolet 53174.
## 7 Chrvsler 51910
## 8 Chrysler 54689.
## 9 CMC 71450
## 10 Dodge 43251.
## # ... with 32 more rows
p <- ggplot(data = Avg_Price,
aes(x = Make,
y = Avg_price,
colour = Make))
p +geom_col()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot(Avg_Price, aes(Make, Avg_price, fill= Avg_price))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Avg_Price %>%
arrange(desc(Avg_price)) %>%
ggplot(aes(x=Make, y=Avg_price, size=Avg_price, fill=Make)) +
geom_point(alpha=0.5, shape=21, color="black") +
scale_size(range = c(.1, 24), name="Population (M)") +
scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A") +
ylab("Average Price") +
xlab("Make") +
theme(legend.position = "none")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
sd <- Make_D %>% group_by(Make)%>% summarise("SD"= sd(Retail_price))
## `summarise()` ungrouping output (override with `.groups` argument)
sd$SD
## [1] 44378.015 27067.326 24919.513 12743.312 25104.510 21775.889
## [7] NA 12057.477 NA 11776.062 14293.851 18191.265
## [13] 10885.249 NA 10254.677 16088.416 16122.035 36970.814
## [19] 7966.218 9679.199 47639.060 25016.845 12502.786 9251.869
## [25] 2404.163 NA 48587.612 9773.320 4242.641 12514.414
## [31] 12944.292 9966.174 13550.038 100791.071 9096.989 8440.383
## [37] 1697.056 8177.839 7095.638 18044.859 25449.498 18072.732
ggplot(Avg_Price) +
geom_bar( aes(x=Make, y=Avg_price), stat="identity", fill="skyblue", alpha=0.7) +
geom_errorbar( aes(x=Make, ymin=Avg_price-sd$SD,
ymax=Avg_price+sd$SD), width=0.4, colour="blue", alpha=0.9, size=1.3)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ylab("Average Retail Price") +
ggtitle("Price variation by Make")
# 3.3.1 Basic colour in R
Diamonds <- read.csv("Diamonds.csv")
Diamonds$cut<- factor(Diamonds$cut,
levels=c('Fair','Good','Very Good','Premium','Ideal'),
ordered=TRUE)
Diamonds$color<- factor(Diamonds$color,
levels=c('J','I','H','G','F','E','D'),
ordered=TRUE)
Diamonds$clarity<- factor(Diamonds$clarity,
levels=c('I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF'),
ordered=TRUE)
# Histogram
p1 <- ggplot(data = Diamonds, aes(carat))
p1 + geom_histogram() # Default chart
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 + geom_histogram(colour = "#FFFFFF") # boarder
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 + geom_histogram(colour = "#FFFFFF", fill = "#FF0000") # fill color
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 + geom_histogram(colour = "#FFFFFF", fill = "#FFAAAA") # Reducing fill color saturation
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density plot
p2 <- ggplot(data = Diamonds, aes(carat,fill = cut))
p2 + geom_density(alpha = .2)
# fill color names
p1 + geom_histogram(colour = "white",fill = "darkolivegreen3")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# PlotHelper ----
# plotHelper(p2 + geom_density(alpha = .2) + scale_fill_manual(values = CPCOLS))
# define custom colors
p2 + geom_density(alpha = .2) +
scale_fill_manual(
values = c("#9A32CD","#FF8C00","#e31a1c","#66CD00","#1C86EE"))
# Exercise ---------
p1 <- ggplot(data = Cars, aes(Kilowatts))+
geom_histogram()+
geom_histogram(colour = "#FFFFFF", fill = "#FFAAAA")
p3 <- ggplot(data = Diamonds, aes(x = color,y = price,fill=cut))
p3 + geom_boxplot()
# user color breqwer to reflect an ordinal or sequential colour scale
p3 + geom_boxplot() + scale_fill_brewer()
# Change color hues ------
p3 + geom_boxplot() + scale_fill_brewer(palette = "Greens")
# Manual ColourBrewer --------
p3 + geom_boxplot() +
scale_fill_manual(values =c('#feebe2',
'#fbb4b9',
'#f768a1',
'#c51b8a',
'#7a0177'))
# Continuous colour scales -------
p4 <- ggplot(data = Cars, aes(x = Kilowatts, y = Retail_price, colour = Economy_highway))
p4 + geom_point()
# change color
p4 + geom_point() + scale_colour_gradient(low="blue",high="red")
# Representing missing values ----
p4 + geom_point() + scale_colour_gradient(low="blue",high="red", na.value = "green")
# Default continuous scales
p4 + geom_point() + scale_color_gradientn(colours = terrain.colors(6))
p4 + geom_point() + scale_color_gradientn(colours = rainbow(6))
p4 + geom_point() + scale_color_gradientn(colours = heat.colors(6))
p4 + geom_point() + scale_color_gradientn(colours = topo.colors(6))
p4 + geom_point() + scale_color_gradientn(colours = cm.colors(6))
p4 + geom_point() + scale_color_gradientn(colours = c('#feebe2','#fcc5c0','#fa9fb5',
'#f768a1','#c51b8a','#7a0177'))
# colorblindr:package is useful for testing a colour scheme for common forms of colour blindness -----
p5 <- ggplot(Diamonds, aes(x = log(price), fill = cut))
p5 <- p5 + geom_density(alpha = .5)
p5
#cvd_grid(p5)
# >>>>> 3.3.4 Change color scale of box plot -----------
p1 <- ggplot(data = Cars_filter,
aes(x = Kilowatts,
y = Retail_price,
fill = Cylinders))
p1 + geom_boxplot()+
scale_fill_brewer(palette = "Greens")
p1 + geom_boxplot() +
scale_fill_manual(values =c('#feebe2','#fbb4b9','#f768a1'))
p2 <- ggplot(data = Cars_filter, aes(x = Economy_city, y = Retail_price, colour = Economy_highway))
p2 + geom_point()
## Warning: Removed 11 rows containing missing values (geom_point).
p2 + geom_point() + scale_colour_gradient(low="blue",high="red")
## Warning: Removed 11 rows containing missing values (geom_point).
msnbc <- read.csv('msnbc.csv')
msnbc_sum <-msnbc %>% group_by(First) %>% summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)
msnbc_sum
## # A tibble: 17 x 2
## First count
## <chr> <int>
## 1 bbs 68605
## 2 business 17270
## 3 frontpage 306151
## 4 health 68897
## 5 living 18898
## 6 local 69999
## 7 misc 19143
## 8 msn-news 2598
## 9 msn-sports 2182
## 10 news 93067
## 11 on-air 173170
## 12 opinion 15225
## 13 sports 65471
## 14 summary 66798
## 15 tech 70973
## 16 travel 9477
## 17 weather 85828
msnbc_sum$Proportion <- msnbc_sum$count/nrow(msnbc)
msnbc_sum$Percent <- msnbc_sum$Proportion*100
msnbc_sum
## # A tibble: 17 x 4
## First count Proportion Percent
## <chr> <int> <dbl> <dbl>
## 1 bbs 68605 0.0595 5.95
## 2 business 17270 0.0150 1.50
## 3 frontpage 306151 0.265 26.5
## 4 health 68897 0.0597 5.97
## 5 living 18898 0.0164 1.64
## 6 local 69999 0.0607 6.07
## 7 misc 19143 0.0166 1.66
## 8 msn-news 2598 0.00225 0.225
## 9 msn-sports 2182 0.00189 0.189
## 10 news 93067 0.0807 8.07
## 11 on-air 173170 0.150 15.0
## 12 opinion 15225 0.0132 1.32
## 13 sports 65471 0.0567 5.67
## 14 summary 66798 0.0579 5.79
## 15 tech 70973 0.0615 6.15
## 16 travel 9477 0.00821 0.821
## 17 weather 85828 0.0744 7.44
p1 <- ggplot(msnbc_sum, aes(x = First, y = count))
p1 + geom_bar(stat = "identity")
# sorting , use - sign to specify descending order
msnbc_sum$First <- msnbc_sum$First %>%
factor(levels = msnbc_sum$First[order(-msnbc_sum$count)])
# y=count
p1<-ggplot(msnbc_sum,aes(x = First, y = count))
p1 + geom_bar(stat="identity")
# y= percentage
p2<-ggplot(msnbc_sum,aes(x = First, y = Percent))
p2 + geom_bar(stat="identity")
# Lables
p2 + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45,hjust=1)) +
labs(title = "Unique Visits to Different MSNBC.com Landing Pages \n 28/09/1999",
y = "Percentage of Unique Visitors",
x = "Landing Page within MSNBC.com Domain")
p2 + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45,hjust=1)) +
labs(title = "Unique Visits to Different MSNBC.com Landing Pages \n 28/09/1999",
y = "Percentage of Unique Visitors",
x = "Landing Page within MSNBC.com Domain") +
geom_text(aes(label=round(Percent,2)), vjust = -0.5,size = 3)
# Color
p2 + geom_bar(stat="identity",fill = "dodgerblue3" ) + theme_minimal() +
theme(axis.text.x=element_text(angle=45,hjust=1)) +
labs(title = "Unique Visits to Different MSNBC.com Landing Pages \n 28/09/1999",
y = "Percentage of Unique Visitors",
x = "Landing Page within MSNBC.com Domain") +
geom_text(aes(label=round(Percent,2)), vjust = -0.5,size = 3)
# Scale
msnbc_sum_filt <- msnbc_sum %>%
filter(First %in% c("tech", "local", "health", "bbs", "summary", "sports"))
p2.2 <- ggplot(msnbc_sum_filt, aes(x = First, y = Percent))
p2.2 + geom_bar(stat = "identity") + coord_cartesian(ylim=c(5.5,6.25))
# anchor the y-axis correctly at 0
p2.2 + geom_bar(stat = "identity")
# Dot plots -------------
p3 <- ggplot(msnbc_sum, aes(y = First, x = count))
p3 + geom_point()
# Sort
msnbc_sum$First <- factor(msnbc_sum$First, levels = msnbc_sum$First[order(msnbc_sum$count)])
p3 <- ggplot(msnbc_sum, aes(y = First, x = count))
p3 + geom_point()
# add trailing lines
p3 <- ggplot(msnbc_sum, aes(y = First, x = count))
p3 + geom_point() + geom_segment(aes(x = 0, y = First, xend = count,yend=First),linetype = 2)
p3 <- ggplot(msnbc_sum, aes(y = First, x = count))
p3 + geom_point(colour = "dodgerblue3") +
geom_segment(aes(x = 0, y = First, xend = count,yend=First),linetype = 2) +
labs(title = "Unique Visits to Different MSNBC.com \n Landing Pages \n 28/09/1999",
x = "No. of Unique Visitors",
y = "Landing Page within MSNBC.com Domain") +
geom_text(aes(label=round(count,2)), hjust = -.2,size = 3) +
scale_x_continuous(limits = c(0,350000))
# Pie chart -------------
p4 <- ggplot(msnbc_sum, aes(x = factor(1), y = count, fill = First))
p4 + geom_bar(stat="identity",width = 1) + coord_polar(theta = "y")
# filter
msnbc_sum_top_five <- msnbc_sum %>% filter(rank(count) > 12)
# Calculate proportions
msnbc_sum_top_five$Proportion <- msnbc_sum_top_five$count/sum(msnbc_sum_top_five$count)
msnbc_sum_top_five$Percent <- msnbc_sum_top_five$Proportion*100
msnbc_sum_top_five<-msnbc_sum_top_five %>% arrange(desc(count))
p5 <- ggplot(msnbc_sum_top_five, aes(x = factor(1), y = count, fill = First))
p5 + geom_bar(stat="identity", width = 1) + coord_polar(theta = "y")
# text alignment
p5 + geom_bar(stat="identity",width = 1) + coord_polar(theta = "y") +
theme(axis.ticks=element_blank(),
axis.title=element_blank(),
axis.text.y=element_blank(),
axis.text.x=element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()) +
labs(fill = "Top Five Landing Pages") +
geom_text(aes(y = (cumsum(count)-count) +
(count/2),
label=round(Percent,2), angle = 0))
# Coxcomb diagram (polar area diagram)-----------
p6 <- ggplot(msnbc_sum_top_five, aes(x = First,y=count,fill=First))
p6 + geom_bar(stat="identity",width = 1) + coord_polar()
Histograms —————– # order and count quantitative variables into equal interval ranges, called bins. # The relative height of bars is used to represent the frequency of data points that fit within each bin. # Histograms are a useful and quick way to explore the distribution of a quantitative variable.
Youtube <- read.csv("Youtube.csv")
p6 <- ggplot(Youtube, aes(x = duration))
p6 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# find actual bin widths and counts are for each bin
p6 <- p6 + geom_histogram()
hist <- ggplot_build(p6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#hist$data
# outlier detection
p7 <- ggplot(Youtube, aes(x = factor(1), y = duration))
p7 + geom_boxplot(width = .25)
# removing outliers
# extreme lower outlier < Q1 - IQR*3
# extreme upper outlier > Q3 + IQR*3
p7 <- p7 + geom_boxplot(width = .25)
box <- ggplot_build(p7)
box$data[[1]][1:5]
## ymin lower middle upper ymax
## 1 1 52 139 281 624
Youtube_clean<-filter(Youtube, duration > 1 & duration < 624)
p8 <- ggplot(Youtube_clean, aes(x = factor(1), y = duration))
p8 + geom_boxplot(width = .25)
p9 <- ggplot(Youtube_clean, aes(x = duration))
p9 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# setting the bins outline to white
p9 + geom_histogram(colour = "white", bins = 100)
# Creating a Density Plot------
p9 + geom_density(fill = "grey") +
geom_histogram(colour="white",aes(duration,..density..),
alpha = 1/2,bins=100)
# To ensure the histogram and density plot share the same scale, we use the ..density.. aesthetic option in geom_histogram()
p9 + geom_density(fill = "dodgerblue", alpha = 1/2) +
geom_histogram(colour="white",aes(duration,..density..),
alpha = 1/2,bins = 100)
# Adding markers and annotations
p9 <- p9 + geom_density(fill = "dodgerblue", alpha = 1/2) +
geom_histogram(colour="white",aes(duration,..density..),
alpha = 1/2,bins = 100) +
geom_vline(xintercept= median(Youtube_clean$duration)) +
annotate("text",label = "Median",x = 190, y = 0.006) +
geom_vline(xintercept= mean(Youtube_clean$duration),linetype=2) +
annotate("text",label = "Mean",x = 240, y = 0.004)
p9
# Creating a Violin Plot--------------
p10 <- ggplot(Youtube_clean,aes(x=factor(1),y = duration))
p10 + geom_violin(width = .25,fill="grey")
p10 <- ggplot(Youtube_clean,aes(x=factor(1),y = duration))
p10 + geom_violin(width = .25, fill="grey") + geom_boxplot(width = .25, alpha = .25)
# Stacked Dot Plots------------
p11 <- ggplot(Youtube_clean,aes(x = duration))
p11 + geom_dotplot()
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
p11 <- ggplot(Youtube_clean,aes(x = duration))
p11 + geom_dotplot(binwidth = 10) +
theme(axis.text.y= element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank())
# take a random sample of n = 50 and plot the sample's distribution of duration
set.seed(462243) #Set the random seed to replicate the plot below
p11 <- ggplot(sample_n(Youtube_clean,50),aes(x = duration))
p11 + geom_dotplot() +
theme(axis.text.y= element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank())
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
#Juxtaposing-----------
p8 <- ggplot(Youtube_clean, aes(x = factor(1), y = duration)) +
geom_boxplot(width = .50) + scale_y_continuous(limits = c(0, 800))
p9 <- ggplot(Youtube_clean, aes(x = duration)) +
geom_density(fill = "dodgerblue", alpha = 1/2) +
geom_histogram(colour="white",aes(duration,..density..),
alpha = 1/2,bins = 100) +
geom_vline(xintercept= median(Youtube_clean$duration)) +
annotate("text",label = "Median",x = 180, y = 0.006) +
geom_vline(xintercept= mean(Youtube_clean$duration),linetype=2) +
annotate("text",label = "Mean",x = 240, y = 0.004) +
scale_x_continuous(limits = c(0, 800))
theme_set(theme_gray())
plot_grid(p9, p8 + coord_flip() + theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y = element_blank()), ncol=1, align="v",
rel_heights = c(2,1))
## Warning: Removed 2 rows containing missing values (geom_bar).
# 4.1.3 Using visualisations to summarise two qualitative variables (4.1.3)————–
Hair_Eye_Colour <- read.csv("Hair_Eye_Colour.csv")
str(Hair_Eye_Colour)
## 'data.frame': 592 obs. of 3 variables:
## $ Hair : chr "Black" "Black" "Black" "Black" ...
## $ Eyes : chr "Brown" "Brown" "Brown" "Brown" ...
## $ Gender: chr "Male" "Male" "Male" "Male" ...
# Summarise quantitative variables
crosstab1 <- table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
crosstab1
## Eyes
## Hair Blue Brown Green Hazel
## Black 20 68 5 15
## Blonde 94 7 16 10
## Brown 84 119 29 54
## Red 17 26 14 14
margin.table(crosstab1,1) #Row marginals
## Hair
## Black Blonde Brown Red
## 108 127 286 71
margin.table(crosstab1,2) #Column marginals
## Eyes
## Blue Brown Green Hazel
## 215 220 64 93
# barchart
p12 <- ggplot(data = Hair_Eye_Colour, aes(x = Hair, fill = Eyes))
p12 + geom_bar()
p12 + geom_bar(position = "fill")
#converts the counts to proportions
p12 + geom_bar(position = "dodge")
#Create crosstabulation
crosstab1<-table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
prop.table(crosstab1, 1) #Row proportions
## Eyes
## Hair Blue Brown Green Hazel
## Black 0.18518519 0.62962963 0.04629630 0.13888889
## Blonde 0.74015748 0.05511811 0.12598425 0.07874016
## Brown 0.29370629 0.41608392 0.10139860 0.18881119
## Red 0.23943662 0.36619718 0.19718310 0.19718310
prop.table(crosstab1, 2) #Column proportions
## Eyes
## Hair Blue Brown Green Hazel
## Black 0.09302326 0.30909091 0.07812500 0.16129032
## Blonde 0.43720930 0.03181818 0.25000000 0.10752688
## Brown 0.39069767 0.54090909 0.45312500 0.58064516
## Red 0.07906977 0.11818182 0.21875000 0.15053763
crosstab1 <- data.frame(prop.table(crosstab1, 1)) #Convert proportion table to df
str(crosstab1) #Data frame summary
## 'data.frame': 16 obs. of 3 variables:
## $ Hair: Factor w/ 4 levels "Black","Blonde",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Eyes: Factor w/ 4 levels "Blue","Brown",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Freq: num 0.185 0.74 0.294 0.239 0.63 ...
colnames(crosstab1) <- c("Hair","Eyes","Proportion") #Fix variable names
str(crosstab1)
## 'data.frame': 16 obs. of 3 variables:
## $ Hair : Factor w/ 4 levels "Black","Blonde",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Eyes : Factor w/ 4 levels "Blue","Brown",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Proportion: num 0.185 0.74 0.294 0.239 0.63 ...
p13 <- ggplot(data = crosstab1, aes(x = Hair, y = Proportion, fill = Eyes))
p13 + geom_bar(stat = "identity",position = "dodge") +
labs(y = "Proportion within Hair Colour")
p13 + geom_bar(stat = "identity",position = "dodge") +
labs(y = "Proportion within Hair Colour") +
scale_fill_manual(values = c("#1569C7","#94703D","#566638","#6B7E47"))
# Mosaic plots-------------
Hair_Eye_Colour$Hair <- as.factor(Hair_Eye_Colour$Hair )
Hair_Eye_Colour$Eyes <- as.factor(Hair_Eye_Colour$Eyes)
Hair_Eye_Colour$Gender <- as.factor(Hair_Eye_Colour$Gender)
vcd::mosaic(~ Hair + Eyes, data = Hair_Eye_Colour, dnn = c("Hair","Eyes"),
shade=TRUE, pop = FALSE)
crosstab1<-table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
labs<-round(prop.table(crosstab1,1),2)
labs
## Eyes
## Hair Blue Brown Green Hazel
## Black 0.19 0.63 0.05 0.14
## Blonde 0.74 0.06 0.13 0.08
## Brown 0.29 0.42 0.10 0.19
## Red 0.24 0.37 0.20 0.20
vcd::mosaic(crosstab1, pop = FALSE, legend=TRUE,shade=TRUE)
labeling_cells(text = labs, margin=0)(crosstab1)
tb <- table(Hair_Eye_Colour$Hair, Hair_Eye_Colour$Eyes)
tb <- data.frame(tb)
colnames(tb) <- c("Hair", "Eyes", "Freq")
tb
## Hair Eyes Freq
## 1 Black Blue 20
## 2 Blonde Blue 94
## 3 Brown Blue 84
## 4 Red Blue 17
## 5 Black Brown 68
## 6 Blonde Brown 7
## 7 Brown Brown 119
## 8 Red Brown 26
## 9 Black Green 5
## 10 Blonde Green 16
## 11 Brown Green 29
## 12 Red Green 14
## 13 Black Hazel 15
## 14 Blonde Hazel 10
## 15 Brown Hazel 54
## 16 Red Hazel 14
p14 <- ggplot(tb)
p14 + geom_mosaic(aes(x = product(Hair), weight = Freq, fill = Eyes)) + labs(x = "Hair Colour")
levVar1 <- length(levels(Hair_Eye_Colour$Hair))
jointTable <- prop.table(table(Hair_Eye_Colour$Hair, Hair_Eye_Colour$Eyes))
plotData <- as.data.frame(jointTable)
plotData$marginVar1 <- prop.table(table(Hair_Eye_Colour$Hair))
plotData$var2Height <- plotData$Freq / plotData$marginVar1
plotData$var1Center <- c(0, cumsum(plotData$marginVar1)[1:levVar1 -1]) +
plotData$marginVar1 / 2
df<-data.frame(prop.table(table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes),1))
df<-group_by(df,Var1)
df<-transmute(df,
csum = (cumsum(Freq)-Freq)+(Freq/2))
plotData$centerlab <- df$csum
p14 + geom_mosaic(aes(x = product(Hair), weight = Freq, fill = Eyes)) +
labs(x = "Hair Colour", y = "Eye Colour Proportion within Hair Colour") +
geom_text(data = plotData, aes(x = var1Center, y = centerlab,label=round(var2Height,2)),
inherit.aes = FALSE)
# 4.1.4 Quantitative bivariate visualisation————
Body <- read.csv("Body.csv")
ggpairs(Body, columns = c(3,6,7,10:19),axisLabels = "internal")
p3 <- ggplot(data = Body, aes(x = Abdomen, y = BFP_Siri))
p3 + geom_point()
p3 + geom_point() + geom_smooth(method = "lm") + geom_smooth(colour = "red")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# add rug plots to help visualise the univariate distributions of the two variables in the plot
p3 + geom_point() + geom_smooth(method = "lm") + geom_smooth(colour = "red") +
geom_rug(alpha = 1/2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p3 + geom_point() + geom_density2d()
# time series
economics <- read.csv("economics.csv")
economics$date <- as.Date(economics$date, format = "%d/%m/%Y")
p4 <-ggplot(data = economics, aes(x = date, pop))
p4 + geom_line() + labs(title = "US Population Growth 1967 - 2015")
# (unpivot) stack all the variables and their values into two columns
#Load tidyr to access gather() function
economics_l <- gather(economics, # Data frame
Variable, # Name of the variable to contain the original variable names
Value, # Name of the variable to contain the variables' values
pce:unemploy) # The variables to be merged into long format
economics_l$Variable <- factor(economics_l$Variable, # Define and label variable factor
labels = c("PCE",
"Population '000",
"PSR",
"Unemployed '000",
"Unemployed Duration"
))
#economics_l
p4 <-ggplot(data = economics_l, aes(x = date, y = Value))
p4 + geom_line() + facet_grid(Variable ~ ., scales = "free",
labeller = label_value) +
labs(title = "US Economic Data 1967 - 2015", y = "")
#Reduce resolution
#Add a month and year variable to the wide economics dataset
economics <- mutate(economics,
quarter = quarters(date),
year = format(economics$date, "%Y"))
#Group the data by year and quarter
economics_ag<-group_by(economics, year, quarter)
#Create a summarised dataset with mean values for yearly quarters
#Save the date for the last day of each quarter
economics_ag <- summarise(economics_ag,
pce = mean(pce),
pop = mean(pop),
psavert = mean(psavert),
uempmed = mean(uempmed),
unemploy = mean(unemploy),
date = max(date))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
#Restructure data to long format
economics_ag_l <- gather(economics_ag,
Variable,
Value,
pce:unemploy)
#Assign factor and labels
economics_ag_l$Variable <- factor(economics_ag_l$Variable, #Define and label variable factor
labels = c("PCE",
"Population '000",
"PSR",
"Unemployed '000",
"Unemployed Duration"))
economics_ag_l
## # A tibble: 925 x 5
## # Groups: year [49]
## year quarter date Variable Value
## <chr> <chr> <date> <fct> <dbl>
## 1 1967 Q1 1967-01-12 PCE 515.
## 2 1968 Q1 1968-01-12 PCE 557.
## 3 1969 Q1 1969-01-12 PCE 604.
## 4 1970 Q1 1970-03-01 PCE 633.
## 5 1970 Q2 1970-06-01 PCE 643.
## 6 1970 Q3 1970-09-01 PCE 654.
## 7 1970 Q4 1970-12-01 PCE 661.
## 8 1971 Q1 1971-03-01 PCE 680.
## 9 1971 Q2 1971-06-01 PCE 694.
## 10 1971 Q3 1971-09-01 PCE 707.
## # ... with 915 more rows
p5<-ggplot(data = economics_ag_l, aes(x = date, y = Value))
p5 + geom_line() + facet_grid(Variable ~ ., scales = "free",
labeller = label_value) +
labs(title = "US Economic Quarterly Data 1967 - 2015 ", y = "")
#One quantitative and one qualitative variable
mpg <- read.csv("mpg.csv")
p6 <- ggplot(data = mpg, aes(x = class, y = cty))
p6 + geom_boxplot()
#order categories from lowest to highest
mpg_rank <- mpg %>% group_by(class) %>% summarise(med = median(cty))
## `summarise()` ungrouping output (override with `.groups` argument)
mpg$class <- mpg$class %>% factor(levels = mpg_rank$class[order(-mpg_rank$med)])
p6 <- ggplot(data = mpg, aes(x = class, y = cty))
p6 + geom_boxplot()
# Horizontal presentation
p6 + geom_boxplot() + coord_flip()
p6 + geom_violin() +
stat_summary(fun.y = "mean", geom = "point", colour = "red")
## Warning: `fun.y` is deprecated. Use `fun` instead.
# jittered point plots with means:
p6 + geom_jitter(width = .2, alpha = .25) +
stat_summary(fun.y = "mean", geom = "point", colour = "red")
## Warning: `fun.y` is deprecated. Use `fun` instead.
p6 + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1/2, alpha = .25) +
stat_summary(fun.y = "mean", geom = "point", colour = "red")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
# Use jittered point plots and dot plots when your sample size is small.
# Box plots and density plots are better when sample size is large.
FEV = read.csv("FEV.csv")
p1 <- ggplot(data = FEV, aes(x = smoking, y = FEV))
p1 + geom_boxplot() + stat_summary(fun.y = "mean", geom = "point",
colour = "red") +
stat_summary(fun.data = "mean_cl_boot", colour = "red",
geom = "errorbar", width = .2)
## Warning: `fun.y` is deprecated. Use `fun` instead.
FEV = read.csv("FEV.csv")
p1 <- ggplot(data = FEV, aes(x = smoking, y = FEV))
p1 + geom_boxplot() + stat_summary(fun.y = "mean", geom = "point",
colour = "red") +
stat_summary(fun.data = "mean_cl_boot", colour = "red",
geom = "errorbar", width = .2)
## Warning: `fun.y` is deprecated. Use `fun` instead.
GGally package
ggpairs(FEV, columns = 1:5,axisLabels = "internal")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Scatterplot
p2 <- ggplot(data = FEV, aes(x = height, y = FEV, colour = smoking))
p2 + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
Faceted scatter plot
p3 <- ggplot(data = FEV, aes(x = height, y = FEV, colour = smoking))
p3 + geom_point() + geom_smooth(method = "lm") + facet_grid(. ~ sex)
## `geom_smooth()` using formula 'y ~ x'
Another strategy would be to convert age to a binary variable such as children (< 10) and adolescents (10–19). This would allow you to use a double facet and squeeze one more variable into the plot. Use this code:
FEV$age_cat <- ifelse(FEV$age < 10, "Children", "Adolescent")
p3.3 <- ggplot(data = FEV,
aes(x = height, y = FEV, colour = smoking))
p3.3 + geom_point() + geom_smooth(method = "lm") + facet_grid(age_cat ~ sex)
## `geom_smooth()` using formula 'y ~ x'
# 4.2.2 Mapping additional aesthetics
simple bivariate scatterplot
gapminder2007 <- gapminder %>% filter(year == 2007)
p1 <- ggplot(gapminder2007, aes(x = gdpPercap, y = lifeExp))
p1 + geom_point() +
labs(x = "GDP Per Capita", y = "Life Expectancy",
title = "Country GDP per capita predicts life expectancy (2007)")
p1 + geom_point(aes(size = pop)) +
labs(x = "GDP Per Capita",
y = "Life Expectancy",
title = "Country GDP per capita predicts life expectancy (2007)") +
scale_size(name = "Pop Size")
Now the population size of a country is portrayed by the size of the point. Size lacks accuracy, but it does a good job in this example of highlighting the large degree of variability in country size.
We will now add a fourth variable by mapping the continent to a colour aesthetic.
p1 + geom_point(aes(size = pop, colour = continent)) +
labs(x = "GDP Per Capita",
y = "Life Expectancy",
title = "Country GDP per capita predicts life expectancy (2007)") +
scale_size(name = "Pop Size") + scale_color_discrete(name = "Continent")
studentInfo <- read.csv("studentInfo.csv")
studentInfo$highest_education <- studentInfo$highest_education %>%
factor(levels = c("No Formal quals","Lower Than A Level",
"A Level or Equivalent", "HE Qualification",
"Post Graduate Qualification"),
ordered = TRUE)
studentInfo$final_result <- studentInfo$final_result %>%
factor(levels = c("Withdrawn", "Fail", "Pass","Distinction"),
ordered =TRUE)
studentInfo$gender <- studentInfo$gender %>%
factor(levels = c("F","M"),
labels = c("Female","Male"))
Suppose we are interested in understanding how a student’s previous qualifications, highest_education, are related to avg_grade. We can start with a simple side-by-side box plot. Use this code:
p2 <- ggplot(data = studentInfo, aes(x = highest_education, y = avg_grade))
p2 + geom_boxplot() +
labs(y = "Average Grade", x = "Highest Education") +
coord_flip()
## Warning: Removed 5866 rows containing non-finite values (stat_boxplot).
Fill gender
p2 + geom_boxplot(aes(fill = gender)) +
labs(y = "Average Grade", x = "Highest Education") +
coord_flip() +
theme(legend.title=element_blank())
## Warning: Removed 5866 rows containing non-finite values (stat_boxplot).
Do students who have higher average grades complete more courses and have higher educational backgrounds? Things are starting to get trickier. This trivariate visualisation includes two quantitative variables and one qualitative variable. The size aesthetic can be used to add an additional quantitative variable. In this visualisation we will map courses to the size aesthetic. Use this code:
p3 <- ggplot(data = studentInfo,
aes(x = highest_education, y = avg_grade,
size = courses))
p3 + geom_point(position = "jitter",alpha = .25) +
scale_size(name = "Courses Finished") +
labs(y = "Average Grade", x = "Highest Education") +
theme(axis.text.x=element_text(angle=45,hjust=1))
## Warning: Removed 5866 rows containing missing values (geom_point).
side by side box plots
p4 <- ggplot(data = studentInfo,
aes(x = highest_education, y = courses))
p4 + geom_boxplot() +
labs(y = "Courses Completed", x = "Highest Education") +
coord_flip()
## Warning: Removed 5847 rows containing non-finite values (stat_boxplot).
Colour (discrete)
p5 <- ggplot(data = studentInfo,
aes(x = courses, y = avg_grade,colour = highest_education))
p5 + geom_point(position = "jitter") +
scale_color_brewer(type = "seq", palette = "YlOrRd") +
labs(y = "Average Grade", x = "Courses Finished")
## Warning: Removed 5866 rows containing missing values (geom_point).
Colour-continuous (heatmaps)
studentInfo_hm <- studentInfo %>% group_by(region, highest_education)
studentInfo_hm <- studentInfo_hm %>% summarise(count = n(),
mean = mean(avg_grade, na.rm = TRUE))
## `summarise()` regrouping output by 'region' (override with `.groups` argument)
Heat map
p7 <- ggplot(data = studentInfo_hm, aes(x = highest_education,
y = region,
fill = mean))
p7 + geom_raster() + labs(y = "Region", x = "Highest Education") +
scale_fill_continuous(name="Average\nGrade") +
theme(axis.text.x=element_text(angle=45,hjust=1))
ordering regions
performance <- studentInfo %>% group_by(region) %>%
summarise(mean = mean(avg_grade, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
performance %>% arrange(mean)
## # A tibble: 13 x 2
## region mean
## <chr> <dbl>
## 1 London Region 71.1
## 2 North Western Region 71.1
## 3 Yorkshire Region 71.6
## 4 Wales 71.9
## 5 West Midlands Region 72.4
## 6 South West Region 72.6
## 7 East Midlands Region 72.7
## 8 East Anglian Region 73.2
## 9 Ireland 73.4
## 10 South Region 73.8
## 11 North Region 73.8
## 12 Scotland 74.4
## 13 South East Region 74.7
p7 <- ggplot(data = studentInfo_hm, aes(x = highest_education,
y = region, fill = mean))
p7 + geom_raster() + labs(y = "Region", x = "Highest Education") +
scale_fill_continuous(name="Average\nGrade") + theme(axis.text.x=element_text(angle=45,hjust=1))
p8 <- ggplot(data = studentInfo, aes(x = highest_education,
y = avg_grade,
fill = gender))
p8 + geom_boxplot() + labs(y = "Average Grade", x = "Highest Education") +
theme(legend.title=element_blank()) + coord_flip()
## Warning: Removed 5866 rows containing non-finite values (stat_boxplot).
using facets
p8 + geom_boxplot() + labs(y = "Average Grade", x = "Highest Education") +
theme(legend.title=element_blank()) + coord_flip() +
facet_wrap(~ region)
## Warning: Removed 5866 rows containing non-finite values (stat_boxplot).
if you want to align visualisations in rows or columns to facilitate accurate comparisons, you can use facet_grid. facet_grid(. ~ var) will align facets as columns and facet_grid(var ~ .) as rows. Use this code:
p8 + geom_boxplot() + labs(y = "Average Grade", x = "Highest Education") +
theme(legend.title=element_blank()) + coord_flip() +
facet_grid(region ~ .)
## Warning: Removed 5866 rows containing non-finite values (stat_boxplot).
Use with caution: as a rule of thumb, you should never use more than two variables in a facet.
p8 + geom_boxplot() + labs(y = "Average Grade", x = "Highest Education") +
theme(legend.title=element_blank()) + coord_flip() +
facet_grid(region ~ age_band)
## Warning: Removed 5866 rows containing non-finite values (stat_boxplot).
# 4.2.5 Purpose-built visualisations
StudentPreference<-read.csv("StudentPreference.csv")
StudentPreference_sk_1 <- table(StudentPreference$Pref_1,StudentPreference$Pref_2)
StudentPreference_sk_2 <- table(StudentPreference$Pref_2,StudentPreference$Pref_3)
StudentPreference_sk_3 <- table(StudentPreference$Pref_3,StudentPreference$Pref_4)
StudentPreference_sk_1 <- data.frame(StudentPreference_sk_1)
StudentPreference_sk_2 <- data.frame(StudentPreference_sk_2)
StudentPreference_sk_3 <- data.frame(StudentPreference_sk_3)
StudentPreference_sk_1$Var1 <- paste("1.",StudentPreference_sk_1$Var1)
StudentPreference_sk_1$Var2 <- paste("2.",StudentPreference_sk_1$Var2)
StudentPreference_sk_2$Var1 <- paste("2.",StudentPreference_sk_2$Var1)
StudentPreference_sk_2$Var2 <- paste("3.",StudentPreference_sk_2$Var2)
StudentPreference_sk_3$Var1 <- paste("3.",StudentPreference_sk_3$Var1)
StudentPreference_sk_3$Var2 <- paste("4.",StudentPreference_sk_3$Var2)
StudentPreference_sk<-rbind(StudentPreference_sk_1,StudentPreference_sk_2,StudentPreference_sk_3)
sk1 <- gvisSankey(StudentPreference_sk, from='Var1', to='Var2', weight='Freq',
options=list(height=600, width=800))
plot(sk1)
## starting httpd help server ... done
vic.lga.shp <- readShapeSpatial("vmlite_lga_cm/vmlite_lga_cm.shp")
## Warning: readShapeSpatial is deprecated; use rgdal::readOGR or sf::st_read
## Warning: readShapePoly is deprecated; use rgdal::readOGR or sf::st_read
class(vic.lga.shp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
names(vic.lga.shp)
## [1] "ufi" "ftype_code" "lga_name" "state" "scale_usec"
## [6] "labeluse_c" "ufi_cr" "lga_name3" "cartodb_id" "created_at"
## [11] "updated_at"
# class sp (SpatialPolygonDataFrame) with 11 variables
#head(vic.lga.shp$lga_name)
# The code verifies 87 lga_names, which is higher than the expected 79. This is because the shp file also includes some islands, resort regions and repeated LGA names.
lga_profiles_data_2011_pt1 <- read.csv("lga_profiles_data_2011_pt1.csv")
#head(lga_profiles_data_2011_pt1$lga_name)
# To merge the shp file and the lga_profiles_data_2011_pt1 data frame, you need to first use the tidy function from the broom package to convert the shp file to a data.frame. This will make it easy to merge with ga_profiles_data_2011_pt1.
lga.shp.f <- tidy(vic.lga.shp, region = "lga_name")
#head(lga.shp.f)
lga.shp.f$lga_name <-lga.shp.f$id # Rename lga name to id
#head(lga.shp.f)
# Merge the profiles
merge.lga.profiles<-merge(lga.shp.f, lga_profiles_data_2011_pt1,
by="lga_name", all.x=TRUE)
# Order the data frame: This will ensure the polygons are drawn correctly in the ggplot object.
choro.data.frame<-merge.lga.profiles[order(merge.lga.profiles$order), ]
p1 <- ggplot(data = choro.data.frame,
aes(x = long, y = lat, group = group,
fill = notifications_per_1_000_people_of_pertussis))
p1 + geom_polygon(color = "black", size = 0.25) +
coord_map()
# Customise your plot
p1 + geom_polygon(color = "black", size = 0.25) +
coord_map() +
scale_fill_distiller(name = "Cases \n per 1,000",
guide = "legend",
palette = "YlOrRd", direction = 1) +
theme_minimal() + theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
panel.grid = element_blank()) +
labs(title="Victorian LGA Pertussis Cases - 2011")
class(vic.lga.shp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
p2 <- leaflet(vic.lga.shp) %>%
setView(lng = 145.5, lat = -36.5, zoom = 6)
p2 %>% addPolygons()
# Merge LGA profile data
merge.lga.profiles3<-sp::merge(vic.lga.shp, lga_profiles_data_2011_pt1,
by="lga_name", duplicateGeoms = TRUE)
# Create a colour scale
bins <- quantile(
lga_profiles_data_2011_pt1$notifications_per_1_000_people_of_pertussis,
probs = seq(0,1,.2), names = FALSE, na.rm = TRUE)
bins
## [1] 0.09988014 0.76654278 1.04295663 1.40278076 2.07256177 5.59552358
ggplot(data = lga_profiles_data_2011_pt1,
aes(x = notifications_per_1_000_people_of_pertussis)) +
geom_histogram(colour = "white", bins = 40) +
geom_vline(
xintercept = quantile(
lga_profiles_data_2011_pt1$notifications_per_1_000_people_of_pertussis,
probs = seq(0,1,0.2), na.rm = TRUE),
colour = "red", lwd = 1, lty = 2)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
pal <- colorBin(
"YlOrRd",
domain = lga_profiles_data_2011_pt1$notifications_per_1_000_people_of_pertussis,
bins = bins
)
# Apply the colour scale
p3 <- leaflet(merge.lga.profiles3) %>%
setView(lng = 147, lat = -36.5, zoom = 6)
p3 %>% addPolygons(
fillColor = ~pal(notifications_per_1_000_people_of_pertussis),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7)
# Add highigliting
p3 %>% addPolygons(
fillColor = ~pal(notifications_per_1_000_people_of_pertussis),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 3,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE))
#Add any relevant variables
labels <- sprintf(
"%s
%g notifications / 1,000 people",
merge.lga.profiles3$lga_name,
merge.lga.profiles3$notifications_per_1_000_people_of_pertussis
) %>% lapply(htmltools::HTML)
p3 %>% addPolygons(
fillColor = ~pal(notifications_per_1_000_people_of_pertussis),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"))
# Add a title and legend
labels <- sprintf(
"%s
%g notifications / 1,000 people",
merge.lga.profiles3$lga_name,
merge.lga.profiles3$notifications_per_1_000_people_of_pertussis
) %>% lapply(htmltools::HTML)
library(htmlwidgets)
library(htmltools)
title <- tags$div(
HTML('<h3>Victorian LGA Pertussis Cases - 2011</h3>')
)
p3 %>% addPolygons(
fillColor = ~pal(notifications_per_1_000_people_of_pertussis),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto")) %>%
addLegend(pal = pal,
values = ~notifications_per_1_000_people_of_pertussis,
opacity = 0.7, title = "Notifications /1,000 people",
position = "bottomright") %>%
addControl(title, position = "topright")